; Bilinear filtering, real Phong shading and glass-like effect in parallel.
; Implemented in FASM by Maciej Guba.
; http://macgub.co.pl
ROUND2 equ 10

glass_tex_tri:
;include 'labs.inc'  ; kept only for profiling purposes
;--------------------------------------------------------------------
; Triangle set-up and scanline dispatcher.
;
; Sorts the three vertices by y, computes per-scanline increments for
; the edge x positions (fixed point, ROUND2 fractional bits), for the
; normal vectors and for (tx, ty, z), then walks the triangle top to
; bottom and calls the horizontal-line procedure whose address is
; passed in xmm7 once for every visible scanline.  The same frame is
; meant to be shared by several rendering models; only the line
; procedure differs.
;
; In:
;    eax  - x1 shl 16 + y1
;    ebx  - x2 shl 16 + y2
;    ecx  - x3 shl 16 + y3
;    esi  - pointer to stencil buffer (dd floats)
;    edi  - pointer to screen buffer
;    edx  - pointer to texture
;    xmm0 - 1st normal vector
;    xmm1 - 2nd normal vector
;    xmm2 - 3rd normal vector
;    xmm3 - lo -> tx3, ty3 as float dwords
;    xmm4 - lo -> hi z1, z2, z3 as float dwords
;    xmm5 - lo -> hi y_min, y_max, x_min, x_max as dword integers
;    xmm6 - lo -> hi tx1, ty1, tx2, ty2 as float dwords
;    xmm7 - lo dword: address of the horizontal line procedure,
;           remaining dwords: texture parameters (shift, width*4, size)
;    stack - no parameters
; Out:   nothing
; Clobbers: all general purpose and xmm registers except ebp/esp
;           (the procedure does not save registers).
;
; Frame: ebp is aligned down to 16 bytes and then biased by -128 so
; that most locals are reachable with a signed 8 bit displacement.
; Everything the procedure stores lives at [ebp-180] .. [ebp+127],
; i.e. strictly below the saved ebp.
;--------------------------------------------------------------------
  push    ebp
  mov     ebp,esp
  sub     esp,352            ; multiple of 4 keeps pushad/call dword aligned
  and     ebp,0xfffffff0
  sub     ebp,(128)

  .dn12   equ [ebp+112]
  .1_nv   equ [ebp+96]
  .2_nv   equ [ebp+80]
  .3_nv   equ [ebp+64]

  .z3     equ [ebp+56]
  .z2     equ [ebp+52]
  .z1     equ [ebp+48]
  .x1     equ [ebp+46]
  .y1     equ [ebp+44]
  .x2     equ [ebp+42]
  .y2     equ [ebp+40]
  .x3     equ [ebp+38]
  .y3     equ [ebp+36]
  .hlp    equ [ebp+34]
  .Zbuf   equ [ebp+32]

  .x_max  equ [ebp+28]
  .x_min  equ [ebp+24]
  .y_max  equ [ebp+20]
  .y_min  equ [ebp+16]
  ; The line procedure address used to live at [ebp+128].  That is the
  ; aligned top of the frame and coincides with the saved-ebp slot
  ; whenever esp is 16-byte aligned after "push ebp", so the caller's
  ; ebp was overwritten.  The slot now sits below .txX_ptr.
  .l_ptr  equ dword [ebp-180]
  .dx12   equ [ebp+8]
  .dx13   equ [ebp+4]
  .dx23   equ [ebp]

  .dn13   equ [ebp-16]
  .dn23   equ [ebp-32]
  .cnv1   equ [ebp-48]  ; current normal vectors on both edges
  .cnv2   equ [ebp-64]

  .dz13   equ [ebp-72]
  .dty13  equ [ebp-76]
  .dtx13  equ [ebp-80]

  .dz12   equ [ebp-88]
  .dty12  equ [ebp-92]
  .dtx12  equ [ebp-96]

  .dz23   equ [ebp-104]
  .dty23  equ [ebp-108]
  .dtx23  equ [ebp-112]

  .cz1    equ [ebp-120]
  .cty1   equ [ebp-124]
  .ctx1   equ [ebp-128]

  .cz2    equ [ebp-136]
  .cty2   equ [ebp-140]
  .ctx2   equ [ebp-144]

  ; Texture coordinates of the sorted vertices are stored here and are
  ; addressed through edx (see the ...d names below):
  ;   ty3 [ebp-152]  tx3 [ebp-156]
  ;   ty2 [ebp-160]  tx2 [ebp-164]
  ;   ty1 [ebp-168]  tx1 [ebp-172]

  .txX_ptr equ [ebp-176]

  ; Slots that are reused once their first owner is no longer needed.
  .First_xres  equ [ebp-48]      ; shares storage with .cnv1
  .call2       equ [ebp+96]      ; shares storage with .1_nv
  .tex_ptr2    equ [ebp+100]     ; ..=..
  .x_res       equ [ebp+64]      ; shares storage with .3_nv
  .tex_shift   equ dword[ebp+68] ; ..=..
  .tex_x4      equ dword[ebp+72]
  .tex_size    equ dword[ebp+76] ; ..=..

  ; The same locals expressed relative to edx = address of .txX_ptr.
  .cz1d   equ [edx+56]
  .cty1d  equ [edx+52]
  .ctx1d  equ [edx+48]


  .cz2d   equ [edx+40]
  .cty2d  equ [edx+36]
  .ctx2d  equ [edx+32]

  .ty3d   equ [edx+24]
  .tx3d   equ [edx+20]
  .ty2d   equ [edx+16]
  .tx2d   equ [edx+12]
  .ty1d   equ [edx+8]
  .tx1d   equ [edx+4]
  .txX_ptrd equ [edx]

       mov      .txX_ptr,edx       ; keep the texture pointer
       lea      edx,.txX_ptr       ; edx = base for the ...d locals
       movss    .l_ptr,xmm7        ; line procedure address
       movups   .First_xres,xmm7   ; texture params (xmm7 is scratch below)

    .sort3:                        ; sort vertices so that y1 <= y2 <= y3
       cmp      ax,bx
       jle      .sort1
       xchg     eax,ebx
       shufps   xmm4,xmm4,11100001b   ; swap z1 <-> z2
       shufps   xmm6,xmm6,01001110b   ; swap (tx1,ty1) <-> (tx2,ty2)
       movaps   xmm7,xmm0             ; swap normals 1 <-> 2
       movaps   xmm0,xmm1
       movaps   xmm1,xmm7
    .sort1:
       cmp      bx,cx
       jle      .sort2
       xchg     ebx,ecx
       shufps   xmm4,xmm4,11011000b   ; swap z2 <-> z3
       movaps   xmm7,xmm6             ; swap (tx2,ty2) <-> (tx3,ty3)
       movlhps  xmm6,xmm3
       movhlps  xmm3,xmm7
       movaps   xmm7,xmm1             ; swap normals 2 <-> 3
       movaps   xmm1,xmm2
       movaps   xmm2,xmm7
       jmp     .sort3
     .sort2:
       movups  .tx1d,xmm6          ; tx1, ty1, tx2, ty2
       movlps  .tx3d,xmm3          ; tx3, ty3
       movaps  .z1,xmm4
       mov     .y1,eax             ; dword store fills .y1 and .x1
       mov     .y2,ebx
       mov     .y3,ecx

       movaps  .y_min,xmm5         ; clip rectangle
    if 0                            ; check whether at least a fragment
       packssdw xmm5,xmm5       ; of the triangle is in the visible area
       pshuflw  xmm5,xmm5,11011000b
       movups   xmm7,.y3
       movaps   xmm6,xmm5
       shufps   xmm5,xmm5,0  ; xmm5 lo-hi -> broadcasted y_min, x_min
       shufps   xmm6,xmm6,01010101b ;xmm6 -> brd y_max x_max
       movaps   xmm4,xmm7
       pcmpgtw  xmm7,xmm5
       pcmpgtw  xmm4,xmm6
       xorps    xmm7,xmm4
       pmovmskb eax,xmm7
       and      eax,0x00aaaaaa
       or       eax,eax
       jz       .rpt_loop2_end
    end if
       movaps   .1_nv,xmm0
       movaps   .2_nv,xmm1
       movaps   .3_nv,xmm2

       ; Load the packed 16 bit coordinates, sign extend them to dwords
       ; and convert to float:  xmm1 = y3, y2, y1, -   xmm3 = x3, x2, x1, -
       movups   xmm1,.hlp
       movups   xmm3,.y3
       xorps    xmm0,xmm0
       xorps    xmm2,xmm2
       psrld    xmm1,16
       psrld    xmm3,16
       pcmpgtw  xmm0,xmm1          ; 0xffff where the word is negative
       pcmpgtw  xmm2,xmm3
       pslld    xmm0,16            ; move the sign mask into the high word
       pslld    xmm2,16
       orps     xmm1,xmm0
       orps     xmm3,xmm2
       cvtdq2ps xmm1,xmm1
       cvtdq2ps xmm3,xmm3
       mov      eax,1 shl ROUND2
       cvtsi2ss xmm5,eax
       movaps   xmm4,xmm3
       movaps   xmm2,xmm1
       shufps   xmm5,xmm5,0        ; broadcast fixed point scale
       shufps   xmm2,xmm2,11000001b   ; y2, y3, y3, -
       shufps   xmm3,xmm3,11000001b   ; x2, x3, x3, -
       shufps   xmm1,xmm1,11011010b   ; y1, y1, y2, -
       shufps   xmm4,xmm4,11011010b   ; x1, x1, x2, -
       subps    xmm2,xmm1          ; dy12, dy13, dy23
       subps    xmm3,xmm4          ; dx12, dx13, dx23
       mulps    xmm3,xmm5

       rcpps    xmm2,xmm2          ; approximate 1/dy for every edge
       mulps    xmm3,xmm2
       cvtps2dq xmm3,xmm3
       movaps   xmm6,xmm2
       shufps   xmm3,xmm3,11000110b   ; reorder to dx23, dx13, dx12
       movaps   xmm7,.2_nv
       shufps   xmm6,xmm6,0        ; broadcast 1/dy12
       movups   .dx23,xmm3         ; fills .dx23, .dx13, .dx12
       subps    xmm7,.1_nv
       mulps    xmm7,xmm6
       movaps   .dn12,xmm7
       movlps   xmm7,.tx2d
       movlps   xmm5,.tx1d
       movhps   xmm7,.z2
       movhps   xmm5,.z1
       subps    xmm7,xmm5
       mulps    xmm7,xmm6
       movaps   .dtx12,xmm7        ; dtx12, dty12, dz12
       movaps   xmm6,xmm2
       movaps   xmm7,.3_nv
       shufps   xmm6,xmm6,01010101b   ; broadcast 1/dy13
       subps    xmm7,.1_nv
       mulps    xmm7,xmm6
       movaps   .dn13,xmm7
       movlps   xmm7,.tx3d
       movhps   xmm7,.z3
       subps    xmm7,xmm5          ; xmm5 still holds tx1, ty1, z1
       mulps    xmm7,xmm6
       movaps   .dtx13,xmm7        ; dtx13, dty13, dz13
       movaps   xmm6,xmm2
       movaps   xmm7,.3_nv
       shufps   xmm6,xmm6,10101010b   ; broadcast 1/dy23
       subps    xmm7,.2_nv
       mulps    xmm7,xmm6
       movaps   .dn23,xmm7
       movlps   xmm7,.tx3d
       movlps   xmm5,.tx2d
       movhps   xmm7,.z3
       movhps   xmm5,.z2
       subps    xmm7,xmm5
       mulps    xmm7,xmm6
       movaps   .dtx23,xmm7        ; dtx23, dty23, dz23

       ; .3_nv is no longer needed: its slot becomes the parameter block
       ; handed to the line procedure (x_res, tex_shift, tex_x4, tex_size).
       movups   xmm7,.First_xres
       movups   .x_res,xmm7
       movzx    ebx,word [xres_var]
       mov      .x_res,ebx         ; replaces the proc address dword

       ; Both edges start at vertex 1.
       movlps   xmm0,.z1
       movlps   xmm3,.tx1d
       movss    .cz1d,xmm0
       movss    .cz2d,xmm0
       movaps   xmm0,.1_nv
       movlps   .ctx1d,xmm3
       movlps   .ctx2d,xmm3
       movaps   .cnv1,xmm0         ; overwrites .First_xres (already copied)
       movaps   .cnv2,xmm0
       movaps    xmm1,xmm0
       movaps    xmm5,xmm3
       mov       eax,.txX_ptrd
       mov       .tex_ptr2,eax     ; .1_nv slot is reused from here on
       mov       ebx,.l_ptr
       mov       .call2,ebx

       movsx    eax,word .x1
       shl      eax,ROUND2         ; eax - current x on edge 1-3 (fixed point)
       mov      ebx,eax            ; ebx - current x on edge 1-2 / 2-3

       movsx     ecx,word .y1      ; ecx - current scanline
       cmp       cx,.y2
       jge      .rpt_loop1_end
    .rpt_loop1:
       pushad
       sar      ebx,ROUND2
       sar      eax,ROUND2
        mov      edx,.txX_ptrd     ; texture pointer for the line procedure
        cmp      ecx,.y_min
        jl       .skp1
        cmp      ecx,.y_max
        jge      .skp1
       cmp      eax,ebx
       je       .skp1              ; empty span
       jl       .ho1
       xchg     eax,ebx            ; make eax the left end, swap attributes
       movaps   xmm4,xmm0
       movaps   xmm7,xmm3
       movaps   xmm0,xmm1
       movaps   xmm3,xmm5
       movaps   xmm1,xmm4
       movaps   xmm5,xmm7
     .ho1:
       cmp       eax,.x_max
       jge        .skp1
        cmp       ebx,.x_min
        jle       .skp1
       movups   xmm2,.y_min        ; clip rectangle for the line procedure
       movups   xmm6,.x_res        ; x_res, tex_shift, tex_x4, tex_size

       call     dword .call2       ; horizontal line procedure
     .skp1:
       popad
       ; Step both edges by one scanline.
       movaps   xmm0,.cnv1
       movaps   xmm1,.cnv2
       movaps   xmm3,.ctx1d
       movaps   xmm5,.ctx2d
       addps    xmm0,.dn13
       addps    xmm1,.dn12
       addps    xmm3,.dtx13
       addps    xmm5,.dtx12
       add      eax,.dx13
       add      ebx,.dx12
       movaps   .cnv1,xmm0
       movaps   .cnv2,xmm1
       movaps   .ctx1d,xmm3
       movaps   .ctx2d,xmm5
       inc      ecx
       cmp      cx,.y2
       jl       .rpt_loop1

   .rpt_loop1_end:
       ; Switch the second edge from 1-2 to 2-3 and run the same loop
       ; again.  .y2 is overwritten with y3, so the next time control
       ; reaches this label the comparison below terminates the walk.
       mov      bx,.y3
       movsx    ecx,word .y2
       cmp      cx,bx
       jge      .rpt_loop2_end
       mov      .y2,bx
       mov      ebx,.dx23
       mov      .dx12,ebx
       movsx    ebx,word .x2               ; eax - cur x1

       shl      ebx,ROUND2                 ; ebx - cur x2
       push     dword .z2
       pop      dword .cz2

       movaps   xmm3,.dn23
       movaps   xmm5,.dtx23
       movaps   xmm1,.2_nv
       movaps   xmm0,.cnv1
       movlps   xmm6,.tx2d
       movaps   .dn12,xmm3
       movaps   .dtx12,xmm5
       movaps   .cnv2,xmm1
       movlps   .ctx2d,xmm6

       movaps   xmm3,.ctx1d
       movaps   xmm5,.ctx2d

       jmp      .rpt_loop1
    .rpt_loop2_end:
       add      esp,352
       pop      ebp
ret

glass_tex_line:
;--------------------------------------------------------------------
; Draws one horizontal span: interpolates the normal and (tx, ty, z)
; from left to right, lights every pixel with the records found at
; lights_aligned, optionally modulates the colour with a bilinearly
; filtered texel and adds the result (saturating, per byte) to the
; pixel already present in the screen buffer.
;
; In:
;    xmm0 - normal vector 1 (left end)
;    xmm1 - normal vector 2 (right end)
;    xmm3 - lo -> hi tx1, ty1, z1 as float dwords
;    xmm5 - lo -> hi tx2, ty2, z2 as float dwords
;    xmm2 - lo -> hi y_min, y_max, x_min, x_max as dword integers
;    xmm4 - normalized light vector (not used at the moment)
;    eax  - x1
;    ebx  - x2   (the code divides by x2 - x1, so x2 must differ from x1)
;    ecx  - y
;    edi  - screen buffer
;    esi  - stencil buffer filled with dd floats
;    edx  - texture pointer
;    xmm6 - lo -> hi x_res, tex_shift, tex_x * 4, tex_size
;           as dword integers
; Out:   nothing
; Clobbers: eax, ebx, ecx, edx, esi, edi, xmm0-xmm7, flags.
;           The direction flag is cleared.
; Reads the globals rph_bump_flag, draw_flag and lights_aligned.
;--------------------------------------------------------------------

         push  ebp
         mov   ebp,esp
         sub   esp,(256+32)
         and   ebp,0xfffffff0
         sub   ebp,128             ; bias the frame so that locals are
                                   ; reachable within ebp +/- 128
        .n1             equ [ebp+16]   ; 16 bytes
        .xf             equ [ebp+48]
        .yf             equ [ebp+52]
        .xd             equ [ebp+56]
        .yd             equ [ebp+60]
        .dn             equ [ebp+64]   ; 16 bytes
        .y_min          equ [ebp+80]
        .y_max          equ [ebp+82]
        .x_min          equ [ebp+84]
        .x_max          equ [ebp+86]
        .zbuff          equ [ebp+88]
        .screen         equ [ebp+92]
        .aprox          equ [ebp+96]

        .lx1            equ [ebp+104]
        .y              equ [ebp+108]
        .lx2            equ [ebp+112]
        .lights_al      equ [ebp+116]

        .cnv            equ [ebp]      ; 16 bytes
        .z1             equ [ebp-8]
        .ty1            equ [ebp-12]
        .tx1            equ [ebp-16]
        .dz             equ [ebp-24]
        .dty            equ [ebp-28]
        .dtx            equ [ebp-32]
        .cz             equ [ebp-40]
        .cty            equ [ebp-44]
        .ctx            equ [ebp-48]

        .draw_flag      equ [ebp-121]
        .rph_bump_flag  equ [ebp-120]
        ; .the_one      equ [ebp-64]
        .aproxG         equ [ebp-80]
        .mask_255f      equ [ebp-96]
        .tex_m2         equ [ebp-112]
        .xy_res         equ [ebp-64]      ; init after .tex_m2 init/copy
        .shd_mark       equ word [ebp+36] ;
        .stencil_ptr    equ dword[ebp+32] ;
        .tx_ptr         equ [ebp-116]
        .x_res          equ [ebp-64]
        .tex_shift      equ [ebp-60]
        .tex_x4         equ [ebp-56]
        .tex_size       equ [ebp-52]

        mov       .y,ecx
        packssdw  xmm2,xmm2        ; clip rectangle as four signed words
        mov       .tx_ptr,edx
        movlps    .y_min,xmm2      ; fills .y_min, .y_max, .x_min, .x_max
        movaps    .x_res,xmm6      ; fills .x_res .. .tex_size

        mov       dh,[rph_bump_flag]
        mov       dl,[draw_flag]
        mov       .draw_flag,dx    ; word store: .draw_flag + .rph_bump_flag

        ; Upper clamp for texture coordinates: tex_x - 1, broadcast.
        ; ecx is free here, it is reloaded from .lx2 before its next read.
        mov       ecx,.tex_x4
        shr       ecx,2
        dec       ecx
        cvtsi2ss  xmm6,ecx
        shufps    xmm6,xmm6,0
        movaps    .n1,xmm0
        mov       .stencil_ptr,esi
        movaps    .tex_m2,xmm6
        mov       .lx1,eax
        mov       .lx2,ebx
        movaps    .tx1,xmm3        ; fills .tx1, .ty1, .z1

        pcmpeqd   xmm6,xmm6        ; build 255.0 in every lane
        psrld     xmm6,24
        cvtdq2ps  xmm6,xmm6
        movaps    .mask_255f,xmm6

        ; Only the low dword of .aproxG is initialised; the compares
        ; that consume it are scalar (cmpnltss).
        mov       dword .aproxG,1.5
        mov       dword .lights_al,lights_aligned
        ; mov       dword .lights_ale,lights_aligned_end

        ; Per-pixel increments: (end - start) * approx 1/(x2 - x1).
        sub       ebx,eax
        cvtsi2ss  xmm7,ebx
        rcpps     xmm7,xmm7
        shufps    xmm7,xmm7,0
        subps     xmm1,xmm0
        mulps     xmm1,xmm7
        movaps    .dn,xmm1
        subps     xmm5,xmm3
        mulps     xmm5,xmm7
        movaps    .dtx,xmm5

        ; Left clipping: advance the start values to x_min.
        ; .x_min / .x_max are signed words (packssdw above), therefore
        ; they are sign extended, consistently with the signed compares.
        mov       ebx,.lx1
        cmp       bx,.x_min
        jge       .no_clip
        movsx     eax,word .x_min
        sub       eax,ebx          ; number of pixels skipped
        cvtsi2ss  xmm7,eax
        shufps    xmm7,xmm7,0
        mulps     xmm5,xmm7
        mulps     xmm1,xmm7
        addps     xmm5,.tx1
        addps     xmm1,.n1
        movsx     eax,word .x_min
        movaps    .tx1,xmm5
        movaps    .n1,xmm1
        mov       dword .lx1,eax
      .no_clip:
        movsx     eax,word .x_max
        mov       ecx,.lx2
        cmp       ecx,eax
        cmovg     ecx,eax          ; right clipping: ecx = min(x2, x_max)
        mov       eax,.x_res
        mul       dword .y         ; clobbers edx
        add       eax,.lx1
        shl       eax,2            ; 4 bytes per pixel / per stencil entry
        add       edi,eax
        add       esi,eax
        xorps     xmm7,xmm7
        sub       ecx,.lx1         ; ecx = pixel count (caller guarantees > 0)
        movaps    xmm2,.tx1        ; xmm2 = running tx, ty, z
        cld                        ; calc_r_phg_bumps uses lods
   .ddraw:
        push      ecx
        ; Stencil test: bit 4 of .shd_mark is set when the stencil value
        ; lies between z - 1.5 and z + 1.5.
        movhlps   xmm6,xmm2
        movhlps   xmm4,xmm2
        addps     xmm6,.aproxG
        subps     xmm4,.aproxG
        cmpnltss  xmm6,dword[esi]
        cmpnltss  xmm4,dword[esi]
        xorps     xmm6,xmm4
        movd      ebx,xmm6
        and       bx,10000b
        mov       .shd_mark,bx   ; mark if front  - 4th bit
        xorps     xmm4,xmm4      ; no normal deviation by default
        cmp       .rph_bump_flag,byte 0
        je        @f
        call      calc_r_phg_bumps ; xmm4 = deviation of the normal
      @@:
        xorps     xmm5,xmm5
        ; NOTE(review): the length is taken from n1 + deviation but the
        ; vector that gets scaled is plain n1 - confirm this is intended.
        movaps    xmm7,.n1
        addps     xmm7,xmm4
        mulps     xmm7,xmm7        ; normalize
        haddps    xmm7,xmm7
        haddps    xmm7,xmm7
        rsqrtps   xmm7,xmm7
        mulps     xmm7,.n1
        movaps    .cnv,xmm7
        ; Split texture coordinates into integer and fractional part.
        movaps    xmm6,xmm2
        minps     xmm6,.tex_m2     ; clamp to tex_x - 1
        cvttps2dq xmm7,xmm6        ; xmm7 = integer texel coordinates
        cvtdq2ps  xmm4,xmm7
        subps     xmm6,xmm4
        movlps    .xf,xmm6         ; fills .xf, .yf
        mov       eax,.lights_al   ; global
        push      ecx
        xor       ecx,ecx
        xorps     xmm4,xmm4
      .again_col:
        ; Three light records of 64 bytes each; the brightest result
        ; per channel is kept in xmm5.
        ; bt        .shd_mark,cx      ; }   shadow march related
        ; jc        .skp              ; }
        movaps    xmm0,[eax]          ; dot(light record, normal)
        ; andps     xmm0,[z_abs_mask] ; calc absolute value
        mulps     xmm0,.cnv           ; last dword should be zeroed
        haddps    xmm0,xmm0
        haddps    xmm0,xmm0
        ; andps     xmm0,[abs_mask]   ; calc absolute value
        ; stencil
        bt        .shd_mark,4         ; if set - front
        jc        .no_reflective
        movaps    xmm4,xmm0           ; dot^16 scaled by [eax+48]
        mulps     xmm4,xmm4
        mulps     xmm4,xmm4
        mulps     xmm4,xmm4
        mulps     xmm4,xmm4
        mulps     xmm4,[eax+48]
        jmp       .skp
      .no_reflective:
        xorps     xmm3,xmm3
        maxps     xmm0,xmm3           ; clamp negative dot product to 0
        mulps     xmm0,[eax+16]
        addps     xmm4,xmm0
      .skp:
        addps     xmm4,[eax+32]
        maxps     xmm5,xmm4
        add       eax,64
        inc       ecx
        cmp       ecx,3
        jnz       .again_col
        pop       ecx
        minps     xmm5,.mask_255f     ; clamp to 255.0
        cmp       .draw_flag,byte 17
        jne       @f
        cvtps2dq  xmm5,xmm5           ; draw_flag = 17: no texture
        jmp       .glass
       @@:
        ; Address of the texel at (x, y) and of the one in the row below.
        sub       esp,8
        movlps    [esp],xmm7
        pop       eax ebx             ; eax = x, ebx = y
        mov       cl,.tex_shift       ; ecx is restored by the pop below
        shl       ebx,cl         ; TEX_SHIFT
        add       eax,ebx
        and       eax,.tex_size  ; TEXTURE_SIZE
        shl       eax,2
        add       eax,.tx_ptr
        mov       ebx,eax
        add       ebx,.tex_x4    ; TEX_X*4
        movlps    xmm7,[eax]          ; two texels of the upper row
        movlps    xmm6,[ebx]          ; two texels of the lower row
        movlps    xmm1,.xf
        call      bi_filter      ; proc in '2bi_fil.inc' file
        mulps     xmm5,xmm7
        cvtps2dq  xmm5,xmm5
        psrld     xmm5,8
      .glass:
        ; Load exactly one pixel; an 8 byte load would read past the
        ; end of the screen buffer on its very last pixel.
        movss     xmm6,[edi]
        packssdw  xmm5,xmm5
        packuswb  xmm5,xmm5
        paddusb   xmm5,xmm6           ; saturating add onto the old pixel
        movss     [edi],xmm5

        pop       ecx
        add       edi,4
        add       esi,4
        inc       dword .lx1         ; cur x
        movaps    xmm0,.n1           ; cur normal
        addps     xmm0,.dn
        addps     xmm2,.dtx
        movaps    .n1,xmm0
        dec       ecx
        jnz       .ddraw

        add       esp,(256+32)
        pop       ebp

ret
;===========================================
; Computes a deviation of the normal vector from the bump map, weighted
; bilinearly by the fractional texture position.
; Requires SSSE3 (pshufb) and SSE4.1 (pmovsxbd, dpps).
; In:   xmm2 - lo -> tx, ty as floats
;       ebp  - frame of the caller; the result of the fractional split
;              is written through .xf, which presumably resolves to the
;              definition made in glass_tex_line - verify when this
;              procedure is called from anywhere else
;       direction flag must be clear (lodsw / lodsd are used)
; Out:  xmm4 - deviation, scaled by the constants at [tex_m2+96]
; Clobbers: eax, ebx, xmm1, xmm3, xmm6, xmm7, flags; esi and edi are saved.
; Constants are read relative to the global tex_m2; the names in the
; comments below are those of the tables expected at these offsets.
; NOTE(review): the row shift uses the assembly-time constant TEX_SHIFT,
; not the run-time tex_shift that glass_tex_line receives - confirm.
calc_r_phg_bumps:
        push      esi
        push      edi
        mov       edi,tex_m2    ; edi = base of the constant tables
        movaps    xmm6,xmm2
        minps     xmm6,[edi]    ; tex_m2 ;    float  TEX_X-2,TEX_Y-2
        maxps     xmm6,[edi+16] ; f2x4
        cvttps2dq xmm7,xmm6     ; integer part of the clamped coordinates

        cvtdq2ps  xmm4,xmm7
        subps     xmm6,xmm4     ; fractional part
        movlps    .xf,xmm6      ; CAREFUL WITH THE .xf DEFINITION !
        sub       esp,8
        movlps    [esp],xmm7
        pop       eax ebx       ; eax = x, ebx = y
        shl       ebx,TEX_SHIFT
        add       eax,ebx       ; eax = index of the current bump pixel
        ; float xf = frac x ; fractional part
        ; float yf = frac y
        movlps    xmm3,[edi+32] ; the_one ;  broadcasted dword 1.0
        subps     xmm3,.xf
        movhps    xmm3,.xf      ; xmm3 - lo-> 1.0-xf, 1.0-yf
                                ; xmm3 - hi-> xf, yf
        ; w1 = (1.0 - xf) * (1.0 - yf) ; weight
        ; w2 = (xf) * (1.0 - yf)
        ; w3 = (1.0 - xf) * (yf)
        ; w4 = (xf) * (yf)
        movaps    xmm7,xmm3
        shufps    xmm3,xmm3,10001000b
        shufps    xmm7,xmm7,11110101b
        mulps     xmm3,xmm7
        ; xmm3 = weights w1..w4 for the points
        ;    p1 = (xd, yd)
        ;    p2 = (xd + 1, yd)
        ;    p3 = (xd,yd + 1)
        ;    p4 = (xd + 1,yd + 1)
        mov       ebx,bump_map
        movd      xmm7,ebx      ; .bump_ptr
        movd      xmm1,eax
        shufps    xmm7,xmm7,0   ; broadcast bump map address
        shufps    xmm1,xmm1,0   ; broadcast pixel index
        paddd     xmm1,[edi+48] ; bump_consts2:   dd -TEX_Y,-1,TEX_Y-1,TEX_Y*2
        andps     xmm1,[edi+64] ; bump_trunc
        paddd     xmm7,xmm1     ; four addresses inside the bump map
        ; The four addresses are pushed and popped into esi one at a
        ; time; every lods reads the bytes of one row of the
        ; neighbourhood sketched on the right.
        sub       esp,16
        movups    [esp],xmm7
        pop       esi
        lodsw
        movzx     ebx,ax
        pop       esi
        lodsd
        movd      xmm1,eax
        pop       esi
        lodsd
        pop       esi
        movd      xmm6,eax   ;      a1
                             ;     b@c3
        ror       ebx,16     ;     4d56
        lodsw                ;      78
        mov       bx,ax
        ror       ebx,16     ; @ - actual pix/bump
        movd      xmm4,ebx
                          ;  xmm1 =  b@c3
                          ;  xmm6 =  4d56
                          ;  xmm4 =  a178
                          ;  p1@=  b-c, a-d      ; x, y dev
                          ;  p2c=  @-3, 1-5
                          ;  p3d=  4-5, @-7
                          ;  p45=  d-6, c-8
        pslldq   xmm1,4
        pslldq   xmm6,8
        orps     xmm4,xmm1
        orps     xmm4,xmm6      ; all twelve bytes in one register
        movaps   xmm1,xmm4
        ; xm1 = xm4 lo->hi =   ; a178b@c34d56
                               ; 0123456789
        pshufb   xmm4,[edi+80] ; b_shf1 ; 4 5 8 9 0 1 5 6
        movhlps  xmm1,xmm4
        psubb    xmm4,xmm1      ; byte differences between neighbours
        shufps   xmm6,xmm4,11101101b
        pmovsxbd xmm6,xmm6      ; sign extend bytes to dwords
        pmovsxbd xmm4,xmm4
        cvtdq2ps xmm4,xmm4
        cvtdq2ps xmm6,xmm6
        dpps     xmm4,xmm3,11110001b   ; weighted sum into lane 0
        dpps     xmm6,xmm3,11110010b   ; weighted sum into lane 1
        ; sub x,y =  p1*w1 + p2*w2 + p3*w3 + p4*w4
        orps     xmm4,xmm6      ; combine lane 0 and lane 1
        mulps    xmm4,[edi+96]  ; norm_dev     ; calc deviation
        pop      edi
        pop      esi
ret
